In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline
In [2]:
df=pd.read_excel("customer_churn_large_dataset.xlsx",sheet_name='Sheet1')
print(df.head())
   CustomerID        Name   Age  Gender     Location  \
0         1.0  Customer_1  63.0    Male  Los Angeles   
1         2.0  Customer_2  62.0  Female     New York   
2         3.0  Customer_3  24.0  Female  Los Angeles   
3         4.0  Customer_4  36.0  Female        Miami   
4         5.0  Customer_5  46.0  Female        Miami   

   Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn  
0                        17.0         73.36           236.0    0.0  
1                         1.0         48.76           172.0    0.0  
2                         5.0         85.47           460.0    0.0  
3                         3.0         97.94           297.0    1.0  
4                        19.0         58.14           266.0    0.0  
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  float64
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  float64
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  float64
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  float64
 8   Churn                       100000 non-null  float64
dtypes: float64(6), object(3)
memory usage: 6.9+ MB
In [5]:
df.describe().transpose()
Out[5]:
count mean std min 25% 50% 75% max
CustomerID 100000.0 50000.500000 28867.657797 1.0 25000.75 50000.50 75000.25 100000.0
Age 100000.0 44.027020 15.280283 18.0 31.00 44.00 57.00 70.0
Subscription_Length_Months 100000.0 12.490100 6.926461 1.0 6.00 12.00 19.00 24.0
Monthly_Bill 100000.0 65.053197 20.230696 30.0 47.54 65.01 82.64 100.0
Total_Usage_GB 100000.0 274.393650 130.463063 50.0 161.00 274.00 387.00 500.0
Churn 100000.0 0.497790 0.499998 0.0 0.00 0.00 1.00 1.0
In [6]:
df.isnull().sum()
Out[6]:
CustomerID                    0
Name                          0
Age                           0
Gender                        0
Location                      0
Subscription_Length_Months    0
Monthly_Bill                  0
Total_Usage_GB                0
Churn                         0
dtype: int64
In [7]:
df=pd.DataFrame(df)
df.head()
Out[7]:
CustomerID Name Age Gender Location Subscription_Length_Months Monthly_Bill Total_Usage_GB Churn
0 1.0 Customer_1 63.0 Male Los Angeles 17.0 73.36 236.0 0.0
1 2.0 Customer_2 62.0 Female New York 1.0 48.76 172.0 0.0
2 3.0 Customer_3 24.0 Female Los Angeles 5.0 85.47 460.0 0.0
3 4.0 Customer_4 36.0 Female Miami 3.0 97.94 297.0 1.0
4 5.0 Customer_5 46.0 Female Miami 19.0 58.14 266.0 0.0
In [8]:
df.drop('CustomerID',axis=1,inplace=True)
df.drop('Name',axis=1,inplace=True)
df.head()
Out[8]:
Age Gender Location Subscription_Length_Months Monthly_Bill Total_Usage_GB Churn
0 63.0 Male Los Angeles 17.0 73.36 236.0 0.0
1 62.0 Female New York 1.0 48.76 172.0 0.0
2 24.0 Female Los Angeles 5.0 85.47 460.0 0.0
3 36.0 Female Miami 3.0 97.94 297.0 1.0
4 46.0 Female Miami 19.0 58.14 266.0 0.0
In [9]:
df['Gender'].value_counts()
Out[9]:
Female    50216
Male      49784
Name: Gender, dtype: int64
In [10]:
df['Churn'].value_counts()
Out[10]:
0.0    50221
1.0    49779
Name: Churn, dtype: int64
In [13]:
df['Location'].unique()
Out[13]:
array(['Los Angeles', 'New York', 'Miami', 'Chicago', 'Houston'],
      dtype=object)
In [11]:
sns.countplot(data=df,x='Churn')
Out[11]:
<AxesSubplot:xlabel='Churn', ylabel='count'>
In [12]:
px.histogram(df,x='Gender',y='Age',color='Churn')
In [14]:
px.histogram(df,x='Location',color='Churn')
In [15]:
px.scatter(df,x='Total_Usage_GB',y='Monthly_Bill',color='Gender')
In [16]:
sns.histplot(x='Monthly_Bill', data=df, hue='Churn', kde=True)
Out[16]:
<AxesSubplot:xlabel='Monthly_Bill', ylabel='Count'>
In [17]:
px.scatter(df,x='Total_Usage_GB',y='Monthly_Bill',color='Churn')
In [18]:
px.histogram(df,x='Total_Usage_GB',color='Churn')
In [19]:
px.histogram(df,x='Monthly_Bill',color='Churn')
In [20]:
px.scatter(df,x='Monthly_Bill',y='Subscription_Length_Months',color='Churn')
In [21]:
px.histogram(df,x='Subscription_Length_Months',color='Churn')
In [22]:
from sklearn.preprocessing import LabelEncoder
l1=LabelEncoder()
In [23]:
df["Gender"]=l1.fit_transform(df["Gender"])
df["Location"]=l1.fit_transform(df["Location"])
In [24]:
df.head()
Out[24]:
Age Gender Location Subscription_Length_Months Monthly_Bill Total_Usage_GB Churn
0 63.0 1 2 17.0 73.36 236.0 0.0
1 62.0 0 4 1.0 48.76 172.0 0.0
2 24.0 0 2 5.0 85.47 460.0 0.0
3 36.0 0 3 3.0 97.94 297.0 1.0
4 46.0 0 3 19.0 58.14 266.0 0.0
In [25]:
df.describe().transpose()
Out[25]:
count mean std min 25% 50% 75% max
Age 100000.0 44.027020 15.280283 18.0 31.00 44.00 57.00 70.0
Gender 100000.0 0.497840 0.499998 0.0 0.00 0.00 1.00 1.0
Location 100000.0 1.995840 1.411638 0.0 1.00 2.00 3.00 4.0
Subscription_Length_Months 100000.0 12.490100 6.926461 1.0 6.00 12.00 19.00 24.0
Monthly_Bill 100000.0 65.053197 20.230696 30.0 47.54 65.01 82.64 100.0
Total_Usage_GB 100000.0 274.393650 130.463063 50.0 161.00 274.00 387.00 500.0
Churn 100000.0 0.497790 0.499998 0.0 0.00 0.00 1.00 1.0
In [26]:
global_mean=df['Churn'].mean()
round(global_mean,2)
Out[26]:
0.5
In [27]:
mean=df.groupby('Age').Churn.mean()
risk=mean/global_mean
risk
Out[27]:
Age
18.0    1.025741
19.0    0.965628
20.0    0.993788
21.0    0.983790
22.0    0.978022
23.0    0.987938
24.0    0.949827
25.0    1.009271
26.0    0.990504
27.0    1.050381
28.0    1.013517
29.0    0.987921
30.0    0.969619
31.0    1.020819
32.0    1.036626
33.0    1.030950
34.0    0.965872
35.0    1.021316
36.0    1.038479
37.0    0.978657
38.0    1.008519
39.0    0.992223
40.0    0.973905
41.0    1.058832
42.0    0.982581
43.0    1.030858
44.0    0.993955
45.0    0.992894
46.0    0.994207
47.0    1.003896
48.0    0.990128
49.0    0.965432
50.0    0.949085
51.0    1.002878
52.0    0.998933
53.0    1.025111
54.0    1.015035
55.0    1.006554
56.0    0.944345
57.0    1.009612
58.0    1.003369
59.0    1.053411
60.0    0.999164
61.0    0.984972
62.0    1.018966
63.0    0.957890
64.0    0.983786
65.0    1.003892
66.0    1.032660
67.0    1.012261
68.0    1.011449
69.0    1.012845
70.0    0.985093
Name: Churn, dtype: float64
In [28]:
mean=df.groupby('Location').Churn.mean()
risk=mean/global_mean
risk
Out[28]:
Location
0    1.001017
1    0.986550
2    0.990356
3    1.010507
4    1.011791
Name: Churn, dtype: float64
In [29]:
mean=df.groupby('Subscription_Length_Months').Churn.mean()
risk=mean/global_mean
risk
Out[29]:
Subscription_Length_Months
1.0     1.011298
2.0     1.002064
3.0     1.013182
4.0     0.989243
5.0     0.970966
6.0     0.979473
7.0     1.008972
8.0     0.998079
9.0     0.989861
10.0    1.005166
11.0    1.000613
12.0    1.008066
13.0    1.018948
14.0    0.988466
15.0    0.979097
16.0    1.011327
17.0    0.999002
18.0    0.997938
19.0    1.013735
20.0    0.996737
21.0    0.985579
22.0    1.007029
23.0    1.007638
24.0    1.017383
Name: Churn, dtype: float64
In [30]:
mean=df.groupby('Monthly_Bill').Churn.mean()
risk=mean/global_mean
risk
Out[30]:
Monthly_Bill
30.00     0.803552
30.01     1.063524
30.02     0.860948
30.03     1.147931
30.04     1.129995
            ...   
99.96     1.187065
99.97     0.547876
99.98     1.374496
99.99     1.116044
100.00    0.669626
Name: Churn, Length: 7001, dtype: float64
In [31]:
mean=df.groupby('Total_Usage_GB').Churn.mean()
risk=mean/global_mean
risk
Out[31]:
Total_Usage_GB
50.0     0.960768
51.0     0.999679
52.0     1.163035
53.0     1.078843
54.0     0.994688
           ...   
496.0    0.936876
497.0    0.977046
498.0    0.989883
499.0    1.021464
500.0    0.869148
Name: Churn, Length: 451, dtype: float64
In [32]:
mean=df.groupby('Gender').Churn.mean()
risk=mean/global_mean
risk
Out[32]:
Gender
0    0.997879
1    1.002140
Name: Churn, dtype: float64
In [33]:
df.drop('Gender',axis=1,inplace=True)
df.drop('Subscription_Length_Months',axis=1,inplace=True)
df.drop('Location',axis=1,inplace=True)
df.drop('Age',axis=1,inplace=True)
df.head()
Out[33]:
Monthly_Bill Total_Usage_GB Churn
0 73.36 236.0 0.0
1 48.76 172.0 0.0
2 85.47 460.0 0.0
3 97.94 297.0 1.0
4 58.14 266.0 0.0
In [34]:
x=df.drop(["Churn"],axis=1).values
y=df["Churn"]
In [37]:
x
Out[37]:
array([[ 73.36, 236.  ],
       [ 48.76, 172.  ],
       [ 85.47, 460.  ],
       ...,
       [ 96.11, 251.  ],
       [ 49.25, 434.  ],
       [ 76.57, 173.  ]])
In [38]:
y
Out[38]:
0        0.0
1        0.0
2        0.0
3        1.0
4        0.0
        ... 
99995    1.0
99996    0.0
99997    1.0
99998    1.0
99999    1.0
Name: Churn, Length: 100000, dtype: float64
In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
In [36]:
sc = StandardScaler()
X = sc.fit_transform(x)
x_train , x_test , y_train , y_test = train_test_split(X , y , test_size=0.25,random_state=101)
In [39]:
x_train.shape, x_test.shape
Out[39]:
((75000, 2), (25000, 2))
In [42]:
lr = LogisticRegression()
lr.fit(x_train,y_train)
Out[42]:
LogisticRegression()
In [41]:
prediction=lr.predict(x_test)
print(confusion_matrix(y_test,prediction))
print(classification_report(y_test,prediction))
[[9692 2909]
 [9740 2659]]
              precision    recall  f1-score   support

         0.0       0.50      0.77      0.61     12601
         1.0       0.48      0.21      0.30     12399

    accuracy                           0.49     25000
   macro avg       0.49      0.49      0.45     25000
weighted avg       0.49      0.49      0.45     25000

In [43]:
print("Logistic Regression accuray is {:.2f}%" .format(lr.score(x_test , y_test)*100) )
Logistic Regression accuray is 49.40%
In [44]:
from sklearn.neighbors import KNeighborsClassifier
In [45]:
error_rate=[]
for i in range (1,50):
    knn=KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train,y_train)
    pred=knn.predict(x_test)
    error_rate.append(np.mean(pred != y_test))
In [46]:
plt.figure(figsize=(14,6))
plt.plot(range(1,50),error_rate,color='blue',marker='o',markerfacecolor='red',markersize=10)
plt.title('Error Rate Vs K Value')
plt.ylabel('Error Rate')
plt.xlabel('K Value')
Out[46]:
Text(0.5, 0, 'K Value')
In [49]:
knn=KNeighborsClassifier(n_neighbors=28)
knn.fit(x_train,y_train)
pred=knn.predict(x_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
[[7490 5111]
 [7181 5218]]
              precision    recall  f1-score   support

         0.0       0.51      0.59      0.55     12601
         1.0       0.51      0.42      0.46     12399

    accuracy                           0.51     25000
   macro avg       0.51      0.51      0.50     25000
weighted avg       0.51      0.51      0.50     25000

In [50]:
print("The accuracy of the KNN Model is {:.2f}%".format(knn.score(x_test,y_test)*100))
The accuracy of the KNN Model is 50.83%
In [51]:
from sklearn.svm import SVC
In [52]:
svm=SVC()
svm.fit(x_train,y_train)
svm.fit(x_train,y_train)
prediction=svm.predict(x_test)
print(confusion_matrix(y_test,prediction))
print(classification_report(y_test,prediction))
[[8092 4509]
 [8055 4344]]
              precision    recall  f1-score   support

         0.0       0.50      0.64      0.56     12601
         1.0       0.49      0.35      0.41     12399

    accuracy                           0.50     25000
   macro avg       0.50      0.50      0.49     25000
weighted avg       0.50      0.50      0.49     25000

In [53]:
print("The accuracy of the Support Vector Machine Model is {:.2f}%".format(svm.score(x_test,y_test)*100))
The accuracy of the Support Vector Machine Model is 49.74%
In [54]:
from sklearn.tree import DecisionTreeClassifier
In [55]:
dtree= DecisionTreeClassifier(criterion='entropy')
dtree.fit(x_train,y_train)
prediction=dtree.predict(x_test)
print(confusion_matrix(y_test,prediction))
print(classification_report(y_test,prediction))
[[6392 6209]
 [6197 6202]]
              precision    recall  f1-score   support

         0.0       0.51      0.51      0.51     12601
         1.0       0.50      0.50      0.50     12399

    accuracy                           0.50     25000
   macro avg       0.50      0.50      0.50     25000
weighted avg       0.50      0.50      0.50     25000

In [56]:
print("The accuracy of the Decicion Tree Model is {:.2f}%".format(dtree.score(x_test,y_test)*100))
The accuracy of the Decicion Tree Model is 50.38%
In [57]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
lr = LogisticRegression(solver='newton-cg')


weights = np.linspace(0.0,0.99,200)

param_grid = {'class_weight': [{0:x, 1:1.0-x} for x in weights]}

gridsearch = GridSearchCV(estimator= lr, 
                          param_grid= param_grid,
                          cv=StratifiedKFold(), 
                          n_jobs=-1, 
                          scoring='f1', 
                          verbose=2).fit(x_train, y_train)

sns.set_style('whitegrid')
plt.figure(figsize=(12,8))
weigh_data = pd.DataFrame({ 'score': gridsearch.cv_results_['mean_test_score'], 'weight': (1- weights)})
sns.lineplot(weigh_data['weight'], weigh_data['score'])
plt.xlabel('Weight for class 1')
plt.ylabel('F1 score')
plt.xticks([round(i/10,1) for i in range(0,11,1)])
plt.title('Scoring for different class weights', fontsize=24)
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
D:\anaconda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning:

Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.

Out[57]:
Text(0.5, 1.0, 'Scoring for different class weights')
In [58]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='newton-cg', class_weight={0: 0.4, 1: 0.6})
lr.fit(x_train, y_train)

pred_test = lr.predict(x_test)
In [59]:
print(confusion_matrix(y_test,pred_test))
print(classification_report(y_test,pred_test))
[[    0 12601]
 [    0 12399]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00     12601
         1.0       0.50      1.00      0.66     12399

    accuracy                           0.50     25000
   macro avg       0.25      0.50      0.33     25000
weighted avg       0.25      0.50      0.33     25000

D:\anaconda\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

D:\anaconda\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

D:\anaconda\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

In [61]:
# save the model to disk
import pickle
filename = 'DTree_model.sav'
pickle.dump(dtree, open(filename, 'wb'))
In [ ]: